show the code
library(tidyverse)
library(broom)
library(knitr)
library(DT)
library(kableExtra)
library(tinytex)
library(moderndive)library(tidyverse)
library(broom)
library(knitr)
library(DT)
library(kableExtra)
library(tinytex)
library(moderndive)baby_names_data <- read_csv(here::here("supporting_artifacts",
"extended learning",
"StateNames_A.csv"
)
)
baby_names_data |>
datatable()NA).baby_names <- baby_names_data |>
rename(sex_at_birth = `Gender`) |>
filter(`Name` == 'Allison')
baby_names |>
group_by(sex_at_birth,State) |>
summarise(sum = sum(`Count`), .groups = "drop") |>
pivot_wider(names_from = `sex_at_birth`,
values_from = `sum`
) |>
mutate(M = coalesce(M, 0)) |>
kable(format = "html",
col.names = c("State",
"Frequency of Babies Sex_at_Birth: F",
"Frequency of Babies Sex_at_Birth: M"
),
align = "ccc",
booktabs = TRUE
) |>
kable_styling(font_size = 12)| State | Frequency of Babies Sex_at_Birth: F | Frequency of Babies Sex_at_Birth: M |
|---|---|---|
| AK | 232 | 0 |
| AL | 1535 | 0 |
| AR | 1198 | 0 |
| AZ | 1880 | 0 |
| CA | 12413 | 0 |
| CO | 1594 | 0 |
| CT | 1099 | 0 |
| DC | 321 | 0 |
| DE | 294 | 0 |
| FL | 4455 | 0 |
| GA | 3257 | 0 |
| HI | 183 | 0 |
| IA | 1477 | 0 |
| ID | 451 | 0 |
| IL | 5110 | 0 |
| IN | 3067 | 0 |
| KS | 1283 | 0 |
| KY | 1905 | 20 |
| LA | 1209 | 0 |
| MA | 2218 | 0 |
| MD | 2229 | 0 |
| ME | 340 | 0 |
| MI | 4014 | 0 |
| MN | 2374 | 0 |
| MO | 2882 | 0 |
| MS | 817 | 0 |
| MT | 226 | 0 |
| NC | 3435 | 0 |
| ND | 285 | 0 |
| NE | 807 | 0 |
| NH | 412 | 0 |
| NJ | 3052 | 0 |
| NM | 399 | 0 |
| NV | 729 | 0 |
| NY | 5747 | 0 |
| OH | 5487 | 0 |
| OK | 1421 | 0 |
| OR | 1186 | 0 |
| PA | 4307 | 0 |
| RI | 306 | 0 |
| SC | 1228 | 0 |
| SD | 376 | 0 |
| TN | 2488 | 0 |
| TX | 10192 | 0 |
| UT | 1125 | 0 |
| VA | 3220 | 0 |
| VT | 135 | 0 |
| WA | 1956 | 0 |
| WI | 2367 | 0 |
| WV | 813 | 0 |
| WY | 142 | 0 |
source: https://sparkbyexamples.com/r-programming/replace-na-values-with-zero-in-r-dataframe/
You should have seen in the table above that “Allison” is a name given overwhelmingly to babies assigned female at birth. So, filter the data to include only babies assigned Female at birth.
Make a visualization showing how the popularity of the name “Allison” has changed over the years. To be clear, each year should have one observation–the total number of Allisons born that year.
baby_names <- baby_names|>
filter(`sex_at_birth` != 'M') |>
group_by(Year) |>
summarise(year_count = sum(`Count`), .groups = "drop")
baby_names |>
ggplot(mapping = aes(x = Year, y = year_count)) +
geom_bar(stat = "identity") +
geom_text(aes(label = year_count), color = "blue", vjust=0) +
ggtitle("Count of Babies named Allison \n with female sex_at_birth") +
labs(x = "Year", y = "")Create a linear model with the year as the explanatory variable, and the number of Allisons as the response. Similar to #4, each year should have one observation–the total number of Allisons born that year.
baby_names_model <- baby_names |>
lm(year_count ~ Year, data = _)Visualize the regression model.
baby_names |>
ggplot(aes(x = Year, y = year_count)) +
geom_point() +
geom_smooth(formula = y ~ x, method = "lm", se = FALSE) +
labs(x = "Year",
y = "") +
ggtitle('year_count ') Write out the estimated regression equation.
get_regression_table(baby_names_model) |>
kable(format = "html")| term | estimate | std_error | statistic | p_value | lower_ci | upper_ci |
|---|---|---|---|---|---|---|
| intercept | 209689.761 | 42971.505 | 4.880 | 0 | 118594.240 | 300785.282 |
| Year | -101.519 | 21.427 | -4.738 | 0 | -146.942 | -56.096 |
\[ \hat{y} = 209689.761- (101.519*Year) \]
\[ \hat{y} = \] predicted number of babies birthed in a year with the name Allison and sex_at_birth Female
Plot the residuals of the model, that is, the actual values minus the predicted values. Comment on the residuals - do you see any patterns?
baby_names_model |>
augment() |>
ggplot(aes(x = Year, y = .resid)) +
geom_point() +
ggtitle('Residuals') +
labs(x = "Year",
y = "")From the graph of the Residuals, I does appears that there is a U type pattern in the graph. This means that the residuals are likely not normally distributed. This does not satisfy the condition and therefore doesn’t satisfy all the linear model conditions.
Therefore, from this model we do not satisfy the normally distributed residuals condition therefore we can’t make a conclusion based on our linear model. From early analysis it may appear that the name Allison is becoming less popular over time, we can’t make any claims.
In middle school I was so upset with my parents for not naming me “Allyson”. Past my pre-teen rebellion, I’m happy with my name and am impressed when baristas spell it “Allison” instead of “Alison”. But I don’t have it as bad as my good friend Allan!
baby_names_p5 <- baby_names_data |>
rename(sex_at_birth = `Gender`) |>
filter(`Name` %in% c('Allan','Alan','Allen'),
`sex_at_birth` != 'F'
)
baby_names_p5 |>
group_by(Year,Name) |>
summarise(year_count = sum(`Count`), .groups = "drop") |>
ggplot(aes(x = Year, y = year_count, color = Name)) +
geom_point() +
ggtitle('year_count') +
labs(x = "Year", y = "")but perhaps it’s not such an unusual name for his home state of Pennsylvania. Compute the total number of babies born with each spelling of “Allan” in 2000, in Pennsylvania and in California. Specifically, each spelling should be its own column and each state should have its own row. Similar to before, a 0 (not an NA) should be used to represent locations where there were no instances of these names.
baby_names_p5 |>
filter(`State` %in% c("CA","PA"),
`Year` == 2000
) |>
group_by(State,Name) |>
summarise(year_count = sum(`Count`), .groups = "drop") |>
pivot_wider(names_from = `Name`,
values_from = `year_count`
) |>
kable(format = "html",
col.names = c("State",
"Frequency of Alan",
"Frequency of Allan",
"Frequency of Allen"),
align = "ccc",
booktabs = TRUE
) |>
kable_styling(font_size = 12)| State | Frequency of Alan | Frequency of Allan | Frequency of Allen |
|---|---|---|---|
| CA | 579 | 131 | 176 |
| PA | 51 | 12 | 56 |
baby_names_p5 |>
filter(`State` %in% c("CA","PA"),
`Year` == 2000
) |>
group_by(State,Name) |>
summarise(sum_cnt = sum(Count), .groups = "drop") |>
pivot_wider(names_from = `Name`,
values_from = `sum_cnt`
) |>
mutate(`row_sum` = `Alan` + `Allan` + `Allen`,
`Alan` = (`Alan`/`row_sum`)*100,
`Allan` = (`Allan`/`row_sum`)*100,
`Allen` = (`Allen`/`row_sum`)*100
) |>
select(-5) |>
kable(format = "html",
col.names = c("State",
"percent named Alan",
"percent named Allan",
"percent named Allen"),
align = "ccc",
booktabs = TRUE
) |>
kable_styling(font_size = 12)| State | percent named Alan | percent named Allan | percent named Allen |
|---|---|---|---|
| CA | 65.34989 | 14.78555 | 19.86456 |
| PA | 42.85714 | 10.08403 | 47.05882 |
After producing the table, the births of babies with the name Allan is still the most uncommon in both California and Pennsylvania.